{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "39329df3-1f99-4b11-9405-5969d52368a7",
   "metadata": {},
   "source": [
    "# Genetic Algorithm-Based Hyperparameter Optimization of a Decision Tree"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "66496ba0-a0c7-4819-9c6d-13daf80c8c9c",
   "metadata": {},
   "source": [
    "This notebook shows how to use [`sklearn-genetic-opt`](https://sklearn-genetic-opt.readthedocs.io/en/stable/) for hyperparameter optimization based on genetic algorithms (a family of evolutionary algorithms). If you are interested in understanding how it works, `sklearn-genetic-opt` uses [DEAP](https://deap.readthedocs.io/) under the hood."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7f61a90e-a119-4bd0-af21-38604c5b4eec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scikit-learn   : 1.0\n",
      "sklearn        : 1.0\n",
      "deap           : 1.3.1\n",
      "sklearn_genetic: 0.7.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Record the versions of the packages this notebook depends on.\n",
    "%load_ext watermark\n",
    "%watermark --packages scikit-learn,sklearn,deap,sklearn_genetic"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f0489c2-dd9c-4e71-a78c-e01201762b37",
   "metadata": {},
   "source": [
    "## Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "271b17ff-5ea4-4161-8b7f-20ba8131d666",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train/Valid/Test sizes: 318 80 171\n"
     ]
    }
   ],
   "source": [
    "from sklearn import model_selection\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import datasets\n",
    "\n",
    "\n",
    "data = datasets.load_breast_cancer()\n",
    "X, y = data.data, data.target\n",
    "\n",
    "# Hold out 30% of the samples as the final test set, stratified by class label.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.3, random_state=1, stratify=y)\n",
    "\n",
    "# Carve a further 20% out of the training portion as a validation subset.\n",
    "# Note: the search below cross-validates on the full X_train, so this\n",
    "# sub-split is shown for reference only.\n",
    "X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(\n",
    "    X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)\n",
    "\n",
    "# Report the training *subset* so the three sizes are disjoint and sum to len(y).\n",
    "print('Train/Valid/Test sizes:', y_train_sub.shape[0], y_valid.shape[0], y_test.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c922b01-86f0-4e83-9e36-446f99f6fe1b",
   "metadata": {},
   "source": [
    "## sklearn-genetic-opt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72e56f33-ec33-46dd-afa2-a1b3c8b3da0b",
   "metadata": {},
   "source": [
    "- Install:  `pip install \"sklearn-genetic-opt[all]\"`\n",
    "\n",
    "- More info: https://sklearn-genetic-opt.readthedocs.io/en/stable/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "96f0b4c1-803a-436f-93d5-31baab55faa5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gen\tnevals\tfitness \tfitness_std\tfitness_max\tfitness_min\n",
      "0  \t15    \t0.773962\t0.131052   \t0.914778   \t0.628165   \n",
      "1  \t28    \t0.888608\t0.0588224  \t0.914778   \t0.673165   \n",
      "2  \t29    \t0.911424\t0.00855215 \t0.914778   \t0.88962    \n",
      "3  \t28    \t0.914778\t4.44089e-16\t0.914778   \t0.914778   \n",
      "4  \t28    \t0.914778\t4.44089e-16\t0.914778   \t0.914778   \n",
      "5  \t28    \t0.914778\t4.44089e-16\t0.914778   \t0.914778   \n",
      "6  \t29    \t0.914778\t4.44089e-16\t0.914778   \t0.914778   \n",
      "7  \t27    \t0.918297\t0.00703797 \t0.932373   \t0.914778   \n",
      "8  \t27    \t0.922989\t0.0087779  \t0.932373   \t0.914778   \n",
      "9  \t29    \t0.928854\t0.00703797 \t0.932373   \t0.914778   \n",
      "10 \t29    \t0.932373\t3.33067e-16\t0.932373   \t0.932373   \n",
      "11 \t29    \t0.932373\t3.33067e-16\t0.932373   \t0.932373   \n",
      "12 \t29    \t0.932373\t3.33067e-16\t0.932373   \t0.932373   \n",
      "13 \t29    \t0.932861\t0.000974684\t0.93481    \t0.932373   \n",
      "14 \t29    \t0.933023\t0.00107755 \t0.93481    \t0.932373   \n",
      "15 \t28    \t0.93416 \t0.00107755 \t0.93481    \t0.932373   \n",
      "16 \t29    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n",
      "17 \t29    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n",
      "18 \t29    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n",
      "19 \t28    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n",
      "20 \t29    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.9348101265822784"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "import numpy as np\n",
    "import scipy.stats\n",
    "\n",
    "from sklearn_genetic import GASearchCV\n",
    "from sklearn_genetic.space import Integer, Categorical, Continuous\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "\n",
    "# The genetic search is stochastic. Seed the global Python and NumPy RNGs\n",
    "# so that re-running this cell on a fresh kernel can reproduce the same run.\n",
    "random.seed(123)\n",
    "np.random.seed(123)\n",
    "\n",
    "clf = DecisionTreeClassifier(random_state=123)\n",
    "\n",
    "# Search space: one entry per decision-tree hyperparameter to tune.\n",
    "params = {\n",
    "    'min_samples_split': Integer(2, 12),\n",
    "    'min_impurity_decrease': Continuous(0.0, 0.5),\n",
    "    'max_depth': Categorical([6, 16, None])\n",
    "}\n",
    "\n",
    "search = GASearchCV(\n",
    "    estimator=clf,\n",
    "    cv=5,\n",
    "    population_size=15,\n",
    "    generations=20,\n",
    "    tournament_size=3,\n",
    "    elitism=True,\n",
    "    keep_top_k=4,\n",
    "    crossover_probability=0.9,\n",
    "    mutation_probability=0.05,\n",
    "    param_grid=params,\n",
    "    criteria='max',\n",
    "    algorithm='eaMuCommaLambda',\n",
    "    n_jobs=-1)\n",
    "\n",
    "# Candidates are scored with 5-fold cross-validation on the training data only;\n",
    "# the test set is not touched here.\n",
    "search.fit(X_train, y_train)\n",
    "\n",
    "search.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2c26399d-ebfc-4b06-86d9-36e49711e908",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'min_samples_split': 8,\n",
       " 'min_impurity_decrease': 0.006258039752250311,\n",
       " 'max_depth': 16}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Hyperparameter combination with the best cross-validated score found by the search.\n",
    "search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "763e816b-6437-45a9-812f-8b429472d75e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Accuracy: 0.99\n",
      "Test Accuracy: 0.94\n"
     ]
    }
   ],
   "source": [
    "# Score the estimator selected by the search on the training data and on the held-out test set.\n",
    "best_model = search.best_estimator_\n",
    "train_acc = best_model.score(X_train, y_train)\n",
    "test_acc = best_model.score(X_test, y_test)\n",
    "\n",
    "print(f\"Training Accuracy: {train_acc:0.2f}\")\n",
    "print(f\"Test Accuracy: {test_acc:0.2f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
